# A script that uses PCA and regression tree approaches to classify lakes

library(tidyverse)
library(readxl)
library(factoextra)
library(viridis)
library(vegan)
library(rpart)
library(partykit)
library(rpart.plot)
library(ggrepel)
library(lubridate)
library(ggpubr)
library(knitr)
# Physiographic attributes for each study lake
physio <- read_excel(
  path = file.path("..", "data", "analysis_outputs", "study-site-tables.xlsx"))

# Grab bigjoin for land cover
bigjoin <- readRDS(
  file = file.path("..", "data", "analysis_outputs", "bigjoin.rds"))


# Per-lake means of the time-varying predictors (summer solar input, May
# snow-water equivalent, flushing index, and the forest:snowice land-cover
# fractions), plus each lake's aspect. distinct() and across() replace the
# superseded unique() + summarize_all() idioms; .groups = "drop" makes the
# output grouping explicit instead of relying on summarise()'s default.
bigjoin_subset <- bigjoin %>%
  select(park_code, site_code, solar_jas, SWE_May, flush_index_noSWE,
         forest:snowice) %>%
  distinct() %>%
  group_by(park_code, site_code) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)),
            .groups = "drop") %>%
  # NOTE(review): if a site carries more than one Aspect value in bigjoin,
  # this join duplicates rows -- assumed one aspect per site; confirm.
  left_join(y = distinct(select(bigjoin, site_code, Aspect)),
            by = "site_code")

# Name matching table: maps legacy lake names onto current site codes
match_sites <- readRDS(file.path("..", "data", "name_site_matches.rds"))

# Select the static variables that go into the analyses

# Select continuous variables, swap legacy lake names for site codes, then
# merge in the per-lake means derived from bigjoin
physio_subset <- physio %>%
  select(Park_code, Lake, Elevation_m, Watershed_area_to_lake_area_ratio,
         Surface_area_ha, Watershed_area_ha, Depth_mean_m, Volume_m3,
         BlueLineOutlet) %>%
  left_join(match_sites, by = c("Lake" = "old_name")) %>%
  select(Park_code, site_code, everything(), -Lake) %>%
  full_join(bigjoin_subset,
            by = c("Park_code" = "park_code", "site_code"))

# PCA

# Scale a numeric subset of the data

# Z-score the numeric variables so PCA is not dominated by scale differences.
# scale() returns a one-column matrix, so wrap it in as.numeric() to keep
# columns as plain numeric vectors (matrix-columns in a data frame trip up
# some downstream functions). across() replaces the superseded mutate_if().
physio_scale <- physio_subset %>%
  select(Elevation_m:snowice) %>%
  mutate(across(where(is.numeric), ~ as.numeric(scale(.x)))) %>%
  cbind(Park_code = physio_subset$Park_code, .)

# Label rows "park_site" so ordination plots can show lake identities
row.names(physio_scale) <- paste0(physio_subset$Park_code,
                                  "_", physio_subset$site_code)

# Run PCA for numeric variables

# PCA via vegan's rda() (unconstrained rda is plain PCA): numeric columns
# only, with rows containing any missing values dropped first
physio_pca <- physio_scale %>%
  select_if(is.numeric) %>%
  na.omit() %>%
  rda()

# PCA Results:

# Print the rda object: total inertia (variance) and per-axis eigenvalues
physio_pca
## Call: rda(X = na.omit(select_if(.tbl = physio_scale, .predicate =
## is.numeric)))
## 
##               Inertia Rank
## Total              14     
## Unconstrained      14   14
## Inertia is variance 
## 
## Eigenvalues for unconstrained axes:
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10  PC11  PC12  PC13 
## 3.993 3.613 2.054 1.625 0.889 0.552 0.515 0.330 0.248 0.070 0.057 0.031 0.016 
##  PC14 
## 0.005

# Add hulls (currently for park)

# https://rpubs.com/brouwern/veganpca#:~:text=PCA%20(Principal%20Components%20Analysis)%20is,is%20also%20developing%20biplot%20tools).
# Base-graphics biplot: sites drawn as text labels, variables ("species" in
# vegan's vocabulary) as points; then overlay convex hulls that enclose the
# lakes belonging to each park.
biplot(physio_pca,
       display = c("sites", 
                   "species"),
       type = c("text",
                "points"))
ordihull(physio_pca,
         group = physio_scale$Park_code)

# Run PCA outside of vegan for another plotting option

# Re-run the PCA with prcomp() so factoextra's fviz_* helpers can be used.
# The data were already standardized above, hence scale. = FALSE here.
physio_pca <- physio_scale %>%
  select_if(is.numeric) %>%
  prcomp(scale. = FALSE)

# PCA Results:

# Print component standard deviations and the loadings (rotation) matrix
physio_pca
## Standard deviations (1, .., p=14):
##  [1] 1.99832728 1.90086537 1.43332969 1.27465404 0.94303813 0.74290374
##  [7] 0.71778513 0.57484149 0.49816889 0.26426417 0.23940289 0.17599454
## [13] 0.12835909 0.06756623
## 
## Rotation (n x k) = (14 x 14):
##                                            PC1         PC2         PC3
## Elevation_m                        0.098718984  0.23475184 -0.34416855
## Watershed_area_to_lake_area_ratio  0.449796087 -0.17888412  0.06708688
## Surface_area_ha                   -0.320374605 -0.37868613 -0.04928645
## Watershed_area_ha                  0.152014455 -0.38729584  0.09538898
## Depth_mean_m                      -0.314680520 -0.13136245 -0.30725730
## Volume_m3                         -0.318831925 -0.33904928 -0.13481002
## solar_jas                         -0.174628564  0.31506057 -0.11295205
## SWE_May                            0.108484795 -0.23484318 -0.19312744
## flush_index_noSWE                  0.444641918 -0.17602530  0.10130398
## forest                            -0.207463643  0.22041638  0.46708998
## shrub                             -0.226003703 -0.33578277  0.02934696
## meadow                             0.094780455 -0.36965147  0.21862863
## barren                             0.344040834  0.02483731 -0.38294310
## snowice                           -0.008680226 -0.05734407 -0.52833824
##                                           PC4         PC5         PC6
## Elevation_m                        0.41258776 -0.31740973  0.33363865
## Watershed_area_to_lake_area_ratio -0.11522654  0.07388409  0.16652979
## Surface_area_ha                    0.04503952 -0.02927842  0.23142526
## Watershed_area_ha                  0.21859443  0.14297311  0.44632710
## Depth_mean_m                      -0.35718999  0.11414038  0.18644067
## Volume_m3                         -0.13802930 -0.09973187  0.30233727
## solar_jas                          0.48473494  0.18402586  0.15053886
## SWE_May                            0.28475866  0.74122416 -0.12510095
## flush_index_noSWE                 -0.12700007  0.03565136  0.06481845
## forest                            -0.21198410  0.13492904  0.36129182
## shrub                              0.36797393 -0.23266276 -0.23461962
## meadow                             0.16343962 -0.36613960 -0.24178192
## barren                            -0.09451794 -0.24024043  0.31004156
## snowice                           -0.26809072  0.02929043 -0.31518814
##                                           PC7         PC8         PC9
## Elevation_m                       -0.07616023  0.08100936 -0.57893303
## Watershed_area_to_lake_area_ratio -0.04847590 -0.11552086 -0.07604728
## Surface_area_ha                    0.09206120 -0.21046271  0.08495574
## Watershed_area_ha                  0.48202756  0.02421180  0.12884070
## Depth_mean_m                      -0.20906331  0.43722376 -0.01275448
## Volume_m3                         -0.18023234 -0.06780204 -0.23499543
## solar_jas                          0.25123441  0.11776102  0.20089734
## SWE_May                           -0.24952486  0.15329918 -0.08765963
## flush_index_noSWE                 -0.03546764 -0.15681991 -0.34007720
## forest                             0.21615457  0.02549403 -0.13505252
## shrub                             -0.14145797 -0.41986698  0.07583023
## meadow                             0.15021705  0.69774804 -0.02454281
## barren                            -0.13544794  0.01670222  0.59789292
## snowice                            0.66373859 -0.11584307 -0.18462463
##                                          PC10         PC11         PC12
## Elevation_m                        0.08851658 -0.110103587  0.205424087
## Watershed_area_to_lake_area_ratio  0.02760596  0.495118264 -0.151998547
## Surface_area_ha                   -0.29885911  0.011215411  0.094717801
## Watershed_area_ha                  0.37964207 -0.336082768 -0.119946146
## Depth_mean_m                       0.53470482  0.269820102 -0.001871218
## Volume_m3                         -0.43727170 -0.001883573 -0.292924661
## solar_jas                         -0.14821476  0.544681169 -0.315502190
## SWE_May                           -0.19191584 -0.082498641  0.326740012
## flush_index_noSWE                 -0.01224291  0.274909842 -0.052099875
## forest                            -0.09816626  0.202598487  0.593683679
## shrub                              0.35071759  0.325059658  0.316915890
## meadow                            -0.21504272  0.133869628  0.114302137
## barren                            -0.19400039  0.037022736  0.350847247
## snowice                           -0.07330691  0.108584281  0.165712590
##                                           PC13         PC14
## Elevation_m                       -0.006973226 -0.157066163
## Watershed_area_to_lake_area_ratio  0.374863403 -0.532641187
## Surface_area_ha                   -0.549383218 -0.484624706
## Watershed_area_ha                  0.108319969  0.132759306
## Depth_mean_m                      -0.142039730 -0.003808667
## Volume_m3                          0.398023291  0.342155721
## solar_jas                         -0.082231367  0.153925841
## SWE_May                            0.072913414  0.021677342
## flush_index_noSWE                 -0.544157066  0.472646941
## forest                             0.138575947  0.092570128
## shrub                              0.165214727  0.179668626
## meadow                             0.008659048 -0.004745226
## barren                             0.022414455  0.181368767
## snowice                            0.109608306  0.026963149

# Scree plot

# Scree plot: percentage of variance explained by each principal component
fviz_eig(physio_pca)

# Biplot & variable contribution

# Biplot of lakes (individuals) and variables, colored from the viridis and
# plasma palettes
fviz_pca_biplot(X = physio_pca, col.var = viridis(n = 5)[2],
                col.ind = plasma(n = 5)[4], ggtheme = theme_bw())


# Variable map colored by each variable's contribution to the displayed axes
fviz_pca_var(X = physio_pca, col.var = "contrib", repel = TRUE,
             gradient.cols = viridis(n = 5, begin = 0, end = 1),
             ggtheme = theme_bw())

# Regression tree for chlorophyll ~ static vars

# Mean chlorophyll per lake across all measurement events. The original
# spread() a single filtered variable into one column before averaging it;
# averaging `value` directly is equivalent and avoids the superseded
# spread(). .groups = "drop" makes the output grouping explicit (silencing
# summarise()'s regrouping message).
mean_chl <- bigjoin %>%
  select(park_code, site_code, event_year, variable, value) %>%
  filter(variable == "Chlorophyll") %>%
  group_by(park_code, site_code) %>%
  summarise(mean_chl = mean(value), .groups = "drop")
## `summarise()` regrouping output by 'park_code' (override with `.groups` argument)
# Attach the mean chlorophyll response to the static lake attributes
reg_tree_data <- physio_subset %>%
  full_join(mean_chl,
            by = c("Park_code" = "park_code", "site_code"))

# Specify the tree, setting the minimum number of obs per node before a split to 10

# Regression tree: mean chlorophyll as a function of static lake and
# watershed attributes; a node needs at least 10 observations (minsplit)
# before a split is attempted
tree <- rpart(
  mean_chl ~ Elevation_m + Watershed_area_to_lake_area_ratio + Depth_mean_m +
    forest + barren + solar_jas + Volume_m3 + SWE_May + flush_index_noSWE,
  data = reg_tree_data,
  minsplit = 10)

# partykit rendering with per-node boxplots of the response
plot(as.party(tree), main = "Chlorophyll", terminal_panel = node_boxplot)

plotcp(tree)

ptree <- prune(tree, cp = 0.025);
rpart.plot(ptree, main="Average Chlorophyll")

# Write the pruned chlorophyll tree to file. The original opened a png
# device here without drawing to it or closing it, so no usable file was
# produced (and later plots would be captured by the orphaned device);
# render the pruned tree and close the device explicitly.
png("NPS_FCA_chl_tree.png", width = 1000, height = 800)
rpart.plot(prune(tree, cp = 0.025), main = "Average Chlorophyll")
dev.off()

# Based on the cross-validation plot, the ideal tree size is 1...

# Slopes of (top 2m water temp ~ SNOTEL/DAS) ~ static vars

# Import Power's data object from his analysis script (v32, ~ line 604)

# Import Power's profile data and split the "park site" identifier into
# separate columns (extra = "merge" keeps multi-word site names intact)
powers_profile_data <- readRDS(
  file.path("..", "data", "analysis_outputs", "sp_dataplot.rds")) %>%
  separate(col = park_site, into = c("park", "site"), sep = " ",
           extra = "merge")

# Top-2m water temperature vs SNOTEL May SWE, one line per lake; text
# labels mark observations flagged as low-snow years (snowyrlo_snotel == 1).
# NOTE(review): group = "black" inside aes() maps group to a constant and is
# overridden by geom_line's own group = short_code -- presumably a leftover
# from an earlier color setting; confirm and consider removing.
plot_wtempSWE_snotel <- ggplot(data = powers_profile_data,
                               aes(x = SWE_May_snotel, y = value, group = "black",
                                   color = (park_code), label = short_code)) +
  geom_line(aes(group = short_code), alpha = 0.5)+
  geom_text_repel(data = filter(powers_profile_data, snowyrlo_snotel == 1),
                  size = 2.5, segment.color = "black", alpha = 1,
                  segment.size = 0.2, box.padding = 0.1) + 
  xlab("May SWE (cm), SNOTEL")+
  # intToUtf8(176) is the degree symbol
  ylab(paste0("Water temperature (", intToUtf8(176), "C), top 2m"))+
  theme_bw()+
  theme(legend.position = "none")+
  theme(strip.text.x = element_text(),
        axis.title.x = element_text(vjust = -0.5)) + 
  scale_colour_viridis_d(end = 0.85) +
  xlim(x = c(0, 325))

# Same plot but against SNODAS-derived May SWE.
# NOTE(review): the label filter still uses snowyrlo_snotel (the SNOTEL
# low-snow flag) rather than a SNODAS equivalent -- verify this is intended.
plot_wtempSWE_snodas <- ggplot(powers_profile_data,
                               aes(x = SWE_May_snodas, y = value,
                                   group = "black", color = (park_code),
                                   label = short_code)) +
  geom_line(aes(group = short_code), alpha = 0.5)+
  geom_text_repel(data = filter(powers_profile_data, snowyrlo_snotel == 1),
                  size = 2.5, segment.color = "black", alpha = 1,
                  segment.size = 0.2, box.padding = 0.1) + 
  xlab("May SWE (cm), SNODAS") +
  ylab(paste0("Water temperature (", intToUtf8(176), "C), top 2m"))+
  theme_bw() +
  theme(legend.position = "none") +
  theme(strip.text.x = element_text(),
        axis.title.x = element_text(vjust = -0.5)) + 
  scale_colour_viridis_d(end = 0.85) +
  xlim(x = c(0, 325))
# One data frame per lake, keyed by short_code
lake_list <- split(x = powers_profile_data,
                   f = powers_profile_data$short_code)

# Per-lake slope of (top-2m water temperature ~ May SWE, SNODAS)
snodas_models <- map_df(.x = lake_list,
                        .f = function(lake_df) {
                          swe_slope <- lm(value ~ SWE_May_snodas,
                                          data = lake_df)$coefficients[2]
                          tibble(park = unique(lake_df$park),
                                 site = unique(lake_df$site),
                                 slope = swe_slope)
                        })

# Per-lake slope of (top-2m water temperature ~ May SWE, SNOTEL)
snotel_models <- map_df(.x = lake_list,
                        .f = function(lake_df) {
                          swe_slope <- lm(value ~ SWE_May_snotel,
                                          data = lake_df)$coefficients[2]
                          tibble(park = unique(lake_df$park),
                                 site = unique(lake_df$site),
                                 slope = swe_slope)
                        })
# Going to recreate SP's plot but using actual lm() outputs to make sure that
# the regression coefficient values are accurate:
# https://stackoverflow.com/questions/44865508/using-ggplot2-to-plot-an-already-existing-linear-model

# Per-lake fitted values plus the rounded slope for the SNODAS regressions
snodas_predict <- map_df(.x = lake_list,
                         .f = function(lake_df) {
                           fit_lm <- lm(value ~ SWE_May_snodas, data = lake_df)
                           cbind(lake_df,
                                 fit_source = "snodas",
                                 fit = predict(fit_lm),
                                 slope = round(fit_lm$coefficients[2],
                                               digits = 2))
                         })

# Fitted per-lake regression lines (SNODAS), each labeled "<lake>_<slope>".
# NOTE(review): labels here filter snowyrlo_snotel == 0, whereas the raw-data
# plots above use == 1 -- confirm which subset should carry labels.
snodas_predict_plot <- snodas_predict %>%
  ggplot(data = .,
         aes(x = SWE_May_snodas, y = fit,
             group = "black", color = (park_code))) +
  geom_line(aes(group = short_code), alpha = 0.5)+
  geom_text_repel(data = filter(snodas_predict, snowyrlo_snotel == 0),
                  size = 2.5, segment.color = "black", alpha = 1,
                  segment.size = 0.2, box.padding = 0.1,
                  mapping = aes(label = paste0(short_code, "_", slope))) + 
  xlab("May SWE (cm), SNODAS") +
  ylab(paste0("Water temperature (", intToUtf8(176), "C), top 2m"))+
  theme_bw() +
  theme(legend.position = "none") +
  theme(strip.text.x = element_text(),
        axis.title.x = element_text(vjust = -0.5)) + 
  scale_colour_viridis_d(end = 0.85) +
  xlim(x = c(0, 325))

# Per-lake fitted values plus the rounded slope for the SNOTEL regressions
snotel_predict <- map_df(.x = lake_list,
                         .f = function(lake_df) {
                           fit_lm <- lm(value ~ SWE_May_snotel, data = lake_df)
                           cbind(lake_df,
                                 fit_source = "snotel",
                                 fit = predict(fit_lm),
                                 slope = round(fit_lm$coefficients[2],
                                               digits = 2))
                         })

# Fitted per-lake regression lines (SNOTEL), labeled "<lake>_<slope>".
# NOTE(review): labels again filter snowyrlo_snotel == 0 (vs == 1 in the
# raw-data plots), and the x-axis label uses lowercase "snotel" unlike the
# uppercase "SNODAS"/"SNOTEL" elsewhere -- confirm both are intended.
snotel_predict_plot <- snotel_predict %>%
  ggplot(data = .,
         aes(x = SWE_May_snotel, y = fit,
             group = "black", color = (park_code))) +
  geom_line(aes(group = short_code), alpha = 0.5)+
  geom_text_repel(data = filter(snotel_predict, snowyrlo_snotel == 0),
                  size = 2.5, segment.color = "black", alpha = 1,
                  segment.size = 0.2, box.padding = 0.1,
                  mapping = aes(label = paste0(short_code, "_", slope))) + 
  xlab("May SWE (cm), snotel") +
  ylab(paste0("Water temperature (", intToUtf8(176), "C), top 2m"))+
  theme_bw() +
  theme(legend.position = "none") +
  theme(strip.text.x = element_text(),
        axis.title.x = element_text(vjust = -0.5)) + 
  scale_colour_viridis_d(end = 0.85) +
  xlim(x = c(0, 325))

# Arrange the four SWE-temperature panels (raw trajectories and their
# fitted lm lines, SNODAS and SNOTEL) into one figure
ggarrange(plot_wtempSWE_snodas, snodas_predict_plot,
          plot_wtempSWE_snotel, snotel_predict_plot)
# First join coefficient datasets, then update site names, then add
# the static variables. suffix disambiguates the two slope columns into
# slope_snodas and slope_snotel.
swe_modeling_data <- snodas_models %>%
  full_join(snotel_models, by = c("park", "site"),
            suffix = c("_snodas", "_snotel")) %>%
  left_join(match_sites, by = c("site" = "old_name")) %>%
  left_join(physio_subset, by = c("park" = "Park_code", "site_code"))
# Tree for the SNODAS-derived slopes using the full static-variable set;
# the slope variables themselves are excluded from the predictors
snodas_tree <- rpart(
  slope_snodas ~ Elevation_m + Watershed_area_to_lake_area_ratio +
    Surface_area_ha + Watershed_area_ha + Depth_mean_m + forest + shrub +
    meadow + barren + snowice,
  data = swe_modeling_data,
  minsplit = 10)

plot(as.party(snodas_tree), main = "SNODAS Slopes",
     terminal_panel = node_boxplot)

# Cross-validated complexity-parameter plot, to guide pruning
plotcp(snodas_tree)
# SNOTEL slope tree. BUG FIX: the original modeled slope_snodas here even
# though this tree is titled "SNOTEL Slopes" and swe_modeling_data carries a
# slope_snotel column from the suffixed join above -- almost certainly a
# copy-paste slip, so the response is corrected to slope_snotel.
snotel_tree <- rpart(formula = slope_snotel ~ Elevation_m +
                       Watershed_area_to_lake_area_ratio + Depth_mean_m + forest + 
                       barren,
                     data = swe_modeling_data, minsplit = 10)

plot(x = as.party(snotel_tree),
     main = "SNOTEL Slopes",
     terminal_panel = node_boxplot)

# Cross-validated complexity-parameter plot, to guide pruning
plotcp(snotel_tree)
# If the snotel model is in the ballpark, it’s highest elevation
# forested lakes with shallow depths that have the anomalous slopes

# Comparison of weekly (continuous) surface temp vs profile temp @ top 2m

# Note that weekly temps are not calculated around a specific point, but based
# on the calendar year. So if we'd like to compare means of weeks centered,
# e.g., on the date a profile measurement was taken, we'll have to change
# this. This might show what it would take.

# Daily continuous-sensor temperature summaries: keep surface temperature
# only and swap legacy lake names for site codes
daily_temps <- read.csv(file = file.path("..", "data", "analysis_outputs",
                                         "all-daily-temp-summaries.csv"),
                        stringsAsFactors = FALSE) %>%
  select(obs_year:mean_value) %>%
  filter(measure == "SurfaceTemp") %>%
  left_join(match_sites, by = c("lake" = "old_name")) %>%
  select(park, site_code, everything(), -lake)

# Water-column profile measurements from the NCCN export. col_types spells
# out every column so read_excel does not have to guess on sparse columns.
# NOTE(review): assumed the 17 col_types entries match the sheet's column
# order -- confirm against the xlsx if the export format changes.
profile_data <- read_excel(
  path = file.path("..",
                   "data",
                   "NPS_NCCN_Mtn_Lakes_Exports",
                   "qs_b364_Water_Column_Profile_Data_20200723_160058.xlsx"),
  col_types = c("text", "text", "text", "numeric", "date", "logical", "numeric",
                "numeric", "date", "text", "logical", "numeric", "text", "text",
                "text", "date", "text"))%>%
  left_join(x = ., y = match_sites, by = c("Site_name" = "old_name")) %>%
  # Replace the export's Site_code with our site_code from match_sites
  select(Park_code, site_code, everything(), -Site_code) %>%
  # Calendar week of each profile, for matching to weekly sensor summaries
  mutate(week_number = week(Start_date))


# One row per profile event: where and when each profile was taken
profile_dates <- select(profile_data,
                        Park_code, site_code, Start_date, Event_year,
                        week_number)

# Summarize daily temps by week, only for weeks that match to profile temp
# measurement events. truncated = 1 lets ymd() tolerate a missing trailing
# date component; semi_join keeps only park/site/year/week combinations that
# have a profile event. .groups = "drop" makes the output grouping explicit
# instead of relying on summarise()'s message-emitting default.
weekly_temps <- daily_temps %>%
  mutate(new_date = ymd(truncated = 1,
                        paste(obs_year, obs_month, obs_day, sep = "-")),
         week_number = week(new_date)) %>%
  semi_join(y = profile_dates, by = c("park" = "Park_code",
                                      "site_code",
                                      "obs_year" = "Event_year",
                                      "week_number")) %>%
  group_by(park, site_code, obs_year, week_number) %>%
  summarise(mean_surf_temp = mean(mean_value), .groups = "drop")
## `summarise()` regrouping output by 'park', 'site_code', 'obs_year' (override with `.groups` argument)
# Pull the profile temp vals from bigjoin (where they've been aggregated already)
# Profile-based top-2m temperatures, already aggregated in bigjoin, tagged
# with the calendar week of the measurement
profile_temps <- bigjoin %>%
  filter(variable == "ProfTemp_top2m") %>%
  select(park_code, site_code, start_date, event_year, variable, value) %>%
  mutate(week_number = week(start_date))

# Join daily and profile temps based on the week in which they occurred.
# pivot_wider() replaces the superseded spread().
joined_temps <- full_join(x = weekly_temps, y = profile_temps,
                          by = c("park" = "park_code",
                                 "site_code",
                                 "obs_year" = "event_year",
                                 "week_number")) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  # Remove Hoh, because of lack of continuous data (expected, right?)
  filter(site_code != "Hoh")

# Weekly continuous surface temperature vs profile-based top-2m temperature
ggplot(data = joined_temps) +
  geom_point(aes(x = ProfTemp_top2m, y = mean_surf_temp, color = site_code))
## Warning: Removed 12 rows containing missing values (geom_point).
# Same comparison, faceted by lake, with a per-lake least-squares line
ggplot(joined_temps, aes(x = ProfTemp_top2m, y = mean_surf_temp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(. ~ site_code)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 12 rows containing non-finite values (stat_smooth).

## Warning: Removed 12 rows containing missing values (geom_point).